import warnings
import os
os.environ["OMP_NUM_THREADS"] = "1"
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.cluster import KMeans
from sklearn.datasets import load_wine
#Load dataset
# scikit-learn's wine recognition dataset: 178 samples, 13 numeric features,
# 3 cultivar classes.
data=load_wine()
# X: feature matrix, shape (178, 13); y: integer class labels in {0, 1, 2}
X=data.data
y=data.target
# Bare expression -- notebook cell display of the raw feature array.
X
array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
1.065e+03],
[1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
1.050e+03],
[1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
1.185e+03],
...,
[1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
8.350e+02],
[1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
8.400e+02],
[1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
5.600e+02]])
# Notebook display of the 13 feature names.
data["feature_names"]
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
# Report the shape of the raw feature matrix.
print("Original Data. Shape:", X.shape)
Original Data. Shape: (178, 13)
# Wrap the raw features in a DataFrame for tabular inspection.
df=pd.DataFrame(data["data"],columns=data["feature_names"])
# Notebook display of the DataFrame (178 rows x 13 columns).
df
| alcohol | malic_acid | ash | alcalinity_of_ash | magnesium | total_phenols | flavanoids | nonflavanoid_phenols | proanthocyanins | color_intensity | hue | od280/od315_of_diluted_wines | proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127.0 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065.0 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100.0 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050.0 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101.0 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185.0 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113.0 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480.0 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118.0 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 173 | 13.71 | 5.65 | 2.45 | 20.5 | 95.0 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740.0 |
| 174 | 13.40 | 3.91 | 2.48 | 23.0 | 102.0 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750.0 |
| 175 | 13.27 | 4.28 | 2.26 | 20.0 | 120.0 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835.0 |
| 176 | 13.17 | 2.59 | 2.37 | 20.0 | 120.0 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840.0 |
| 177 | 14.13 | 4.10 | 2.74 | 24.5 | 96.0 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560.0 |
178 rows × 13 columns
# Scatter the raw samples in the plane of the first two features,
# coloured by class label, for a first look at class separability.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolor='k')
ax.set_title("Original Data (First Two Features)")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
fig.colorbar(points, ax=ax, label="Classes")
ax.grid(True)
plt.show()
#PairPlot
# Pairwise scatter/KDE plot of the RAW (unscaled) features, coloured by class.
# BUGFIX: the original named this frame `X_norm_df` and titled the figure
# "Standardized Features", but `X` here is the raw matrix -- standardization
# only happens further down via StandardScaler.
features_df=pd.DataFrame(X,columns=data.feature_names)
features_df["target"]=y
sns.pairplot(features_df,hue="target",palette="viridis",diag_kind="kde")
plt.suptitle("Pairwise Relationships of Raw Features",y=1.02)
plt.show()
#Correlation Matrix
# Pearson correlations between the 13 features, rendered as an annotated heatmap.
corr_matrix = np.corrcoef(X.T)
plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    xticklabels=data.feature_names,
    yticklabels=data.feature_names,
)
plt.title("Correlation Matrix - Heatmap")
plt.show()
#Normalize the data
# Standardize each feature to zero mean / unit variance. PCA via SVD assumes
# centred data, and scaling stops wide-range features (e.g. proline) from
# dominating the components.
scaler=StandardScaler()
X_normalized=scaler.fit_transform(X)
# Notebook display of the standardized matrix.
X_normalized
array([[ 1.51861254, -0.5622498 , 0.23205254, ..., 0.36217728,
1.84791957, 1.01300893],
[ 0.24628963, -0.49941338, -0.82799632, ..., 0.40605066,
1.1134493 , 0.96524152],
[ 0.19687903, 0.02123125, 1.10933436, ..., 0.31830389,
0.78858745, 1.39514818],
...,
[ 0.33275817, 1.74474449, -0.38935541, ..., -1.61212515,
-1.48544548, 0.28057537],
[ 0.20923168, 0.22769377, 0.01273209, ..., -1.56825176,
-1.40069891, 0.29649784],
[ 1.39508604, 1.58316512, 1.36520822, ..., -1.52437837,
-1.42894777, -0.59516041]])
# Standardization preserves the shape: still (178, 13).
print("Normalized Data. Shape:", X_normalized.shape)
Normalized Data. Shape: (178, 13)
# Apply SVD
# Thin SVD of the standardized data: X = U @ diag(S) @ VT.
# Rows of VT are the principal directions; S holds the singular values
# in descending order.
U,S,VT=np.linalg.svd(X_normalized,full_matrices=False)
#Display singular values
print("Singular Values:\n", S)
Singular Values: [28.94203422 21.08225141 16.04371561 12.78973645 12.32374195 10.68713954 9.90368818 7.8760733 7.17081793 6.6828618 6.33958815 5.48097635 4.28967045]
# Thin SVD shapes -- U (178, 13), S (13,), VT (13, 13); notebook display.
U.shape,S.shape,VT.shape
((178, 13), (13,), (13, 13))
#Compute explained variance
# Each component's variance equals S_i^2 / (n - 1); normalizing by the total
# gives the fraction of variance captured per component.
explained_variance=(S**2)/(X_normalized.shape[0]-1)
explained_variance_ratio=explained_variance/np.sum(explained_variance)
print("Explained Variance Ratio:",explained_variance_ratio)
Explained Variance Ratio: [0.36198848 0.1920749 0.11123631 0.0706903 0.06563294 0.04935823 0.04238679 0.02680749 0.02222153 0.01930019 0.01736836 0.01298233 0.00795215]
#Visualize explained variance ratio
# Bar chart of per-component explained variance, each bar annotated with its ratio.
plt.figure(figsize=(8,5))
# FIX: the original reused the name `vrbar` for both the bar container and the
# loop variable, shadowing the container on the first iteration -- confusing,
# even though enumerate() had already captured the container.
bars = plt.bar(range(1, len(S) + 1), explained_variance_ratio, alpha=0.7,
               label="Explained Variance Ratio")
for i, bar in enumerate(bars):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
             f'{explained_variance_ratio[i]:.2f}', ha='center', va='bottom')
plt.xlabel("Principal Components")
plt.ylabel("Variance Ratio")
plt.title("Explained Variance per Component")
plt.legend()
plt.show()
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
print("Cumulative Explained Variance:",cumulative_explained_variance)
#Choose the number of components - to obtain 95% Variance
# argmax returns the first 0-based index where the cumulative ratio reaches
# 0.95; +1 converts that index into a component count. (cumsum ends at 1.0,
# so a qualifying index always exists.)
k=np.argmax(cumulative_explained_variance>=0.95)+1
print(f"\nNumber of components chosen (95% variance): {k}")
#Plot explained variance
plt.figure(figsize=(8,5))
# BUGFIX: plot against explicit component counts 1..13. The original plotted
# against the default 0-based index, so the green line at x=k actually marked
# k+1 components instead of k.
component_counts = range(1, len(cumulative_explained_variance) + 1)
plt.plot(component_counts, cumulative_explained_variance, marker='o',
         label='Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.axvline(x=k, color='g', linestyle='--', label=f'{k} Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.legend()
plt.show()
Cumulative Explained Variance: [0.36198848 0.55406338 0.66529969 0.73598999 0.80162293 0.85098116 0.89336795 0.92017544 0.94239698 0.96169717 0.97906553 0.99204785 1. ] Number of components chosen (95% variance): 10
# Loadings: Contribution of each feature to principal components
# Rows of VT are the principal directions, so VT[:k].T is the (13, k) loadings
# matrix directly. FIX: the original's trailing `[:, :k]` slice was a no-op
# (the transpose already has exactly k columns) and has been dropped.
loadings=VT[:k].T
print(f"Top {k} Principal Components Loadings:\n", loadings)
Top 10 Principal Components Loadings: [[-0.1443294 0.48365155 -0.20738262 0.0178563 -0.26566365 0.21353865 -0.05639636 0.39613926 -0.50861912 0.21160473] [ 0.24518758 0.22493093 0.08901289 -0.53689028 0.03521363 0.53681385 0.42052391 0.06582674 0.07528304 -0.30907994] [ 0.00205106 0.31606881 0.6262239 0.21417556 -0.14302547 0.15447466 -0.14917061 -0.17026002 0.30769445 -0.02712539] [ 0.23932041 -0.0105905 0.61208035 -0.06085941 0.06610294 -0.10082451 -0.28696914 0.42797018 -0.20044931 0.05279942] [-0.14199204 0.299634 0.13075693 0.35179658 0.72704851 0.03814394 0.3228833 -0.15636143 -0.27140257 0.06787022] [-0.39466085 0.06503951 0.14617896 -0.19806835 -0.14931841 -0.0841223 -0.02792498 -0.40593409 -0.28603452 -0.32013135] [-0.4229343 -0.00335981 0.1506819 -0.15229479 -0.10902584 -0.01892002 -0.06068521 -0.18724536 -0.04957849 -0.16315051] [ 0.2985331 0.02877949 0.17036816 0.20330102 -0.50070298 -0.25859401 0.59544729 -0.23328465 -0.19550132 0.21553507] [-0.31342949 0.03930172 0.14945431 -0.39905653 0.13685982 -0.53379539 0.37213935 0.36822675 0.20914487 0.1341839 ] [ 0.0886167 0.52999567 -0.13730621 -0.06592568 -0.07643678 -0.41864414 -0.22771214 -0.03379692 -0.05621752 -0.29077518] [-0.29671456 -0.27923515 0.08522192 0.42777141 -0.17361452 0.10598274 0.23207564 0.43662362 -0.08582839 -0.52239889] [-0.37616741 -0.16449619 0.16600459 -0.18412074 -0.10116099 0.26585107 -0.0447637 -0.07810789 -0.1372269 0.52370587] [-0.28675223 0.36490283 -0.12674592 0.23207086 -0.1578688 0.11972557 0.0768045 0.12002267 0.57578611 0.162116 ]]
# Create a heatmap with Loadings
# Rows = the 13 features, columns = PC1..PCk; cell value = the feature's
# weight in that principal component.
plt.figure(figsize=(10, 8))
sns.heatmap(loadings, annot=True, cmap="coolwarm", xticklabels=[f"PC{i+1}" for i in range(loadings.shape[1])],
            yticklabels=data.feature_names, cbar_kws={"label": "Contribution"})
plt.title("Heatmap of Feature Contributions to Principal Components")
plt.xlabel("Principal Components")
plt.ylabel("Features")
plt.show()
def pcs_impl(X_normalized, U, S, VT, k):
    """Project data onto the top-k principal components.

    Parameters
    ----------
    X_normalized : ndarray of shape (n_samples, n_features)
        Standardized data to project. It need not be the matrix that was
        decomposed (projecting a held-out test set works too).
    U, S : ndarray
        Left singular vectors / singular values from np.linalg.svd. Unused
        here (projection only needs VT) but kept for interface compatibility
        with existing callers. Note: U[:, :k] @ diag(S[:k]) equals this
        projection ONLY when X_normalized is the decomposed matrix itself,
        so it cannot be used for test data.
    VT : ndarray of shape (n_features, n_features)
        Right singular vectors; rows are principal directions.
    k : int
        Number of components to keep.

    Returns
    -------
    ndarray of shape (n_samples, k)
        The data expressed in the top-k principal-component basis.
    """
    # FIX: the original also built U_k = U[:, :k] and diag(S[:k]) that were
    # never used -- dead code removed; the projection itself is unchanged.
    return X_normalized @ VT[:k, :].T
# Project the standardized data onto the chosen top-k components.
X_reduced=pcs_impl(X_normalized,U,S,VT,k)
print(f"Shape of reduced data: {X_reduced.shape}")
Shape of reduced data: (178, 10)
#Visualization of the first two principal components - PC's
# 2-D projection of the samples onto PC1/PC2, coloured by true class label.
plt.figure(figsize=(8, 6))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='viridis', edgecolor='k')
plt.title("Projection onto First Two Principal Components")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.colorbar(label="Classes")
plt.show()
#The Choice of k vs Classification accuracy with and without PCA
#Split the data into training and testing sets
# FIX: pin random_state so the split (and hence the reported accuracies) is
# reproducible, matching the fixed random_state=42 used by the models below.
# stratify=y keeps the three class proportions equal in train and test.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)
#Function to perform With and Without PCA and evaluate model performance using Logistic Regression
def evaluate_with_and_without_pca_performance(X_train, X_test, y_train, y_test, k_range):
    """Compare Logistic Regression accuracy with and without SVD-based PCA.

    For each k in k_range the data is projected onto the top-k principal
    components (computed from the training set only) and a fresh
    LogisticRegression is fit and scored on the projected test set. As a side
    effect, a 3x3 grid of PC1/PC2 scatter plots (one subplot per k) is shown.

    Returns a DataFrame with columns "k", "Accuracy Without PCA" (constant
    full-feature baseline, repeated per row) and "Accuracy With PCA".
    """
    performance = {"k": [], "Accuracy Without PCA": [], "Accuracy With PCA": []}
    # Normalize the data: fit the scaler on the training split only so no
    # test-set statistics leak into the transform.
    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)
    # Train and evaluate without PCA (baseline on all 13 features).
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train_normalized, y_train)
    y_pred = model.predict(X_test_normalized)
    accuracy_no_pca = accuracy_score(y_test, y_pred)
    # SVD of the training data; VT is reused to project both train and test.
    U, S, VT = np.linalg.svd(X_train_normalized, full_matrices=False)
    # Plot configuration for 3x3 subplot grid (assumes len(k_range) <= 9).
    plt.figure(figsize=(15, 15))
    for i, k in enumerate(k_range, 1):
        # Apply PCA with SVD by keeping the top k components.
        X_train_pca = pcs_impl(X_train_normalized, U, S, VT, k)
        X_test_pca = pcs_impl(X_test_normalized, U, S, VT, k)
        #Train and evaluate with PCA
        model_pca = LogisticRegression(max_iter=1000, random_state=42)
        model_pca.fit(X_train_pca, y_train)
        y_pred_pca = model_pca.predict(X_test_pca)
        accuracy_with_pca = accuracy_score(y_test, y_pred_pca)
        #Save results
        performance["k"].append(k)
        performance["Accuracy Without PCA"].append(accuracy_no_pca)
        performance["Accuracy With PCA"].append(accuracy_with_pca)
        #Plot the first two principal components for each value of k
        plt.subplot(3, 3, i)
        plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis', edgecolor='k', s=50)
        plt.title(f"PCA Projection for k={k}\nX_train_pca shape: {X_train_pca.shape}\nX_test_pca shape: {X_test_pca.shape}")
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.colorbar(label="Classes")
        plt.grid(True)
    plt.tight_layout()
    plt.show()
    return pd.DataFrame(performance)
# Range of k (number of components) to evaluate: 2 through 10 (9 values,
# filling the 3x3 subplot grid inside the evaluation function).
k_range=range(2,11)
# Evaluate model performance
performance_df=evaluate_with_and_without_pca_performance(X_train,X_test,y_train,y_test,k_range)
# Display the results
print(performance_df)
k Accuracy Without PCA Accuracy With PCA 0 2 0.972222 1.000000 1 3 0.972222 0.972222 2 4 0.972222 0.972222 3 5 0.972222 0.972222 4 6 0.972222 0.944444 5 7 0.972222 0.944444 6 8 0.972222 0.944444 7 9 0.972222 0.972222 8 10 0.972222 0.972222
# Plot the results
# Accuracy-vs-k curves: the flat full-feature baseline against the PCA curve.
plt.figure(figsize=(10,6))
plt.plot(performance_df["k"],performance_df["Accuracy Without PCA"],label="Accuracy Without PCA",marker='o')
plt.plot(performance_df["k"],performance_df["Accuracy With PCA"],label="Accuracy With PCA",marker='o')
plt.xlabel("Number of Principal Components (k)")
plt.ylabel("Accuracy")
plt.title("Model Performance with and without PCA")
plt.legend()
plt.grid(True)
plt.show()
# Function to perform With and Without PCA and evaluate model performance using Random Forest
def evaluate_with_and_without_pca_performance_rf(X_train, X_test, y_train, y_test, k_range):
    """Compare Random Forest accuracy with and without SVD-based PCA.

    Mirrors evaluate_with_and_without_pca_performance but fits
    RandomForestClassifier models instead of LogisticRegression. For each k in
    k_range the data is projected onto the top-k principal components
    (computed from the training set only), a fresh forest is fit, and test
    accuracy is recorded; a 3x3 grid of PC1/PC2 scatters is shown.

    Returns a DataFrame with columns "k", "Accuracy Without PCA" (constant
    full-feature baseline, repeated per row) and "Accuracy With PCA".
    """
    performance = {"k": [], "Accuracy Without PCA": [], "Accuracy With PCA": []}
    # Normalize the data: scaler fit on the training split only (no leakage).
    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)
    # Train and evaluate without PCA using Random Forest (baseline).
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_normalized, y_train)
    y_pred = rf_model.predict(X_test_normalized)
    accuracy_no_pca = accuracy_score(y_test, y_pred)
    # Perform SVD on the normalized training data; VT projects train and test.
    U, S, VT = np.linalg.svd(X_train_normalized, full_matrices=False)
    # Plot configuration for 3x3 subplot grid (assumes len(k_range) <= 9).
    plt.figure(figsize=(15, 15))
    for i, k in enumerate(k_range, 1):
        # Apply PCA with SVD by keeping the top k components.
        X_train_pca = pcs_impl(X_train_normalized, U, S, VT, k)
        X_test_pca = pcs_impl(X_test_normalized, U, S, VT, k)
        # Train and evaluate with PCA using Random Forest
        rf_model_pca = RandomForestClassifier(random_state=42)
        rf_model_pca.fit(X_train_pca, y_train)
        y_pred_pca = rf_model_pca.predict(X_test_pca)
        accuracy_with_pca = accuracy_score(y_test, y_pred_pca)
        # Save results
        performance["k"].append(k)
        performance["Accuracy Without PCA"].append(accuracy_no_pca)
        performance["Accuracy With PCA"].append(accuracy_with_pca)
        # Plot the first two principal components for each value of k
        plt.subplot(3, 3, i)
        plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis', edgecolor='k', s=50)
        plt.title(f"PCA Projection for k={k}\nX_train_pca shape: {X_train_pca.shape}\nX_test_pca shape: {X_test_pca.shape}")
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.colorbar(label="Classes")
        plt.grid(True)
    plt.tight_layout()
    plt.show()
    return pd.DataFrame(performance)
# Evaluate Random Forest performance over the same k range defined earlier.
performance_df_rf=evaluate_with_and_without_pca_performance_rf(X_train,X_test,y_train,y_test,k_range)
# Display the results
print(performance_df_rf)
k Accuracy Without PCA Accuracy With PCA 0 2 1.0 0.972222 1 3 1.0 0.972222 2 4 1.0 0.972222 3 5 1.0 1.000000 4 6 1.0 0.972222 5 7 1.0 1.000000 6 8 1.0 0.972222 7 9 1.0 1.000000 8 10 1.0 1.000000
# Plot the results
# Accuracy-vs-k comparison for the Random Forest runs.
plt.figure(figsize=(10, 6))
for column in ("Accuracy Without PCA", "Accuracy With PCA"):
    plt.plot(performance_df_rf["k"], performance_df_rf[column], label=column, marker='o')
plt.xlabel("Number of Principal Components (k)")
plt.ylabel("Accuracy")
plt.title("Model Performance with and without PCA")
plt.legend()
plt.grid(True)
plt.show()